# This software has been approved for release by the U.S. Geological Survey (USGS). Although the software has been subjected to rigorous review, 
# the USGS reserves the right to update the software as needed pursuant to further analysis and review. No warranty, expressed or implied, 
# is made by the USGS or the U.S. Government as to the functionality of the software and related material nor shall the fact of release constitute any such warranty. 
# Furthermore, the software is released on condition that neither the USGS nor the U.S. Government shall be held liable for any damages resulting from 
# its authorized or unauthorized use.
library(gbm)
library(caret)
library(doParallel)  
library(gmodels)
library(dismo)
library(fs)

# Load the modelling table from "16.csv" (path is relative to the current working directory)
model.data <- read.csv(file = "16.csv")
#############################################################
### parameter tuning by cross validation in parallel mode ###
#############################################################

# Candidate hyperparameter values for the boosted-tree search; expand.grid() crosses
# every combination of:
#   n.trees           - number of trees
#   interaction.depth - interaction depth / tree complexity
#   shrinkage         - shrinkage / learning rate
#   n.minobsinnode    - minimum number of observations per node
grid <- expand.grid(
  n.trees = seq(from = 50, to = 30000, by = 100),
  interaction.depth = seq(from = 2, to = 8, by = 1),
  shrinkage = c(0.0005, 0.001, 0.005, 0.01, 0.05, 0.1),
  n.minobsinnode = seq(from = 2, to = 10, by = 2)
)

# Fix the RNG and pre-draw the seeds that caret uses so the parallel run is fully reproducible.
# The list needs (n_repeats * n_resamples) + 1 entries: one integer vector per resample,
# followed by a single integer for the last (final) model.
# NOTE: the length of each per-resample vector is typically - but not always - the number of
# rows in the tuning grid; for boosted models such as gbm, the number of
# interaction.depth x shrinkage x n.minobsinnode combinations is the value to use there.
set.seed(714)
seeds <- lapply(seq_len(10), function(resample) {
  sample.int(n = 10000, size = nrow(grid) / 10)
})
seeds <- c(seeds, list(sample.int(10000, 1))) # single seed for the last model

# Resampling settings handed to train() when it evaluates the tuning grid for the
# 10-predictor BRT model: one repeat of 10-fold cross-validation, using the
# pre-generated seeds, with the training data retained and per-iteration logging off.
fitControl <- caret::trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1,
  seeds = seeds,
  returnData = TRUE,
  verboseIter = FALSE
)

# Start one worker per detected core. detectCores() is documented to return NA when the
# core count cannot be determined, so fall back to a single worker instead of failing
# inside makeCluster().
cl <- makeCluster(max(1L, detectCores(), na.rm = TRUE))
registerDoParallel(cl) ## register package for use by caret, which has foreach package built-in
getDoParWorkers()  ## shows number of available threads, 2 per processor typically
# open Windows Task Manager/Performance to see CPU usage for all threads when running model in parallel

# Use the train function in caret to evaluate the full grid of tuning parameters.
# The call is wrapped in tryCatch(finally = ...) so the worker processes are always shut
# down, whether train() succeeds or raises an error (the error itself still propagates).
# registerDoSEQ() then restores the sequential foreach backend so any later foreach/caret
# call does not try to use the stopped cluster.
Tune_set_accum <- tryCatch(
  caret::train(TOC ~ .,
               data = model.data,
               method = "gbm",
               bag.fraction = .5,
               tuneGrid = grid,
               trControl = fitControl),
  finally = {
    parallel::stopCluster(cl)
    foreach::registerDoSEQ()
  }
)
# Refit the final boosted regression tree with the selected tuning parameters.
# The smaller interaction depth / tree complexity was chosen to minimize over-fitting.
set.seed(714)
final_brt <- gbm(
  formula = TOC ~ .,
  distribution = "gaussian",
  data = model.data,
  n.trees = 250,
  interaction.depth = 4,
  n.minobsinnode = 2,
  shrinkage = 0.05,
  bag.fraction = 0.5,
  cv.folds = 10,
  verbose = FALSE
)

# Summarise the fitted model
summary(final_brt)
##############################################################################################